# Extract the leading "[category]" tag from each post title,
# e.g. "[公告] 今日活動" -> "[公告]".
# NOTE(review): str_extract() returns the whole match, brackets included;
# the capture group is never used here.
ptt_dt[, category := stringr::str_extract(title, "^\\[([^]]+?)\\]")]

# Show the 20 most frequent categories (untagged posts are dropped).
ptt_dt[!is.na(category), .N, by = .(category)][order(-N)] %>% head(20)

# Keep the top-20 table for the plots below.
top20 <- ptt_dt[!is.na(category), .N, by = .(category)][order(-N)] %>% head(20)

# Posting dates for the top-20 categories; factor levels follow the
# frequency ranking so facets appear in order of popularity.
d <- ptt_dt[category %in% top20$category, .(category, post_time)][
  , category := factor(category, levels = top20$category)][
  , post_date := as.Date(post_time)]
# Density of posting dates per category, one facet per category.
ggplot(d, aes(x = post_date, color = category, fill = category)) +
  stat_density(geom = "area", alpha = 0.2) +
  # stat_bin(binwidth = 30, geom = "area", alpha = 0.2) +
  # scale_color_brewer(palette = "Set2") +
  # scale_fill_brewer(palette = "Set2") +
  ggtitle("PTT Keelung 發文種類分佈") +
  # NOTE(review): this guides() spec is moot because
  # legend.position = "none" below hides every legend.
  guides(color = guide_legend(title = NULL, label.position = "top",
                              nrow = 2, byrow = TRUE),
         fill = FALSE) +
  facet_wrap(~ category, ncol = 5) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(family = "STHeiti", size = 14))

# Rank terms by summed tf-idf, keeping terms found by find_freq_terms(3).
key_term <- dtm_train_tfidf %>% find_freq_terms(3) %>%
  colSums() %>%
  data.frame() %>%
  data.table(keep.rownames = TRUE) %>%
  setnames(c("keyword", "sum_tf_idf")) %>%
  .[order(-sum_tf_idf)]

# Interactive table of the 100 strongest keywords.
key_term %>% head(100) %>% DT::datatable()

# The top 200 keywords feed the word cloud below.
d <- key_term %>% head(200)
# One colour per keyword, interpolated from the 8-colour "Set2" palette.
ncolor <- nrow(d)
getPalette <- colorRampPalette(RColorBrewer::brewer.pal(8, "Set2"))
wordcloud2(d,
           size = 0.5,
           fontFamily = "Noto Sans CJK TC",
           fontWeight = "normal",
           rotateRatio = 0,
           color = getPalette(ncolor),
           shape = "circle")

# Preprocessing ------------------------------------------------
doc.list <- post_text_seg

## tf-idf
# Define the tf-idf model, then fit it on the training DTM and transform
# in one step.
tfidf <- TfIdf$new()
dtm_train_tfidf <- fit_transform(dtm, tfidf)
# tfidf modified by fit_transform() call!

# Summed tf-idf per term, restricted to terms kept by
# find_freq_terms(lowfreq = 5).  Computed once and reused (the original
# ran the identical pipeline twice).
tfidf_sums <- dtm_train_tfidf %>% find_freq_terms(lowfreq = 5) %>% colSums()
l1 <- median(tfidf_sums)
# Terms whose summed tf-idf exceeds the median.
# BUGFIX: the original did `(tfidf_sums > l1) %>% names`, but names() of a
# logical vector returns ALL names, so the median filter had no effect.
# which() keeps only the names of the TRUE entries.
l1_terms <- names(which(tfidf_sums > l1))

# compute the table of terms:
# term.table <- dtm %>% slam::col_sums()
# term.table <- sort(term.table, decreasing = TRUE)
term.table <- setNames(vocab$vocab$terms_counts, vocab$vocab$terms)
# remove terms that are stop words or occur fewer than 5 times:
# del <- term.table < 5
# term.table <- term.table[!del]
# vocab <- names(term.table)
# Convert one tokenized document into the 2-row integer matrix format used
# by the lda package: row 1 holds 0-based vocabulary indices, row 2 holds a
# count of 1 for every token occurrence.  Tokens absent from `vocab` are
# silently dropped.
get_terms <- function(doc.list, vocab) {
  idx <- match(doc.list, vocab)
  idx <- idx[!is.na(idx)]
  rbind(as.integer(idx - 1L), rep(1L, length(idx)))
}
# Map every segmented document into the lda index/count format, in parallel.
documents <- mclapply(doc.list, get_terms, vocab = l1_terms, mc.cores = 3)

# Compute some statistics related to the data set:
D <- length(documents)  # number of documents (2,000)
# NOTE(review): W counts `vocab`, but `documents` above was indexed against
# `l1_terms` -- these should presumably refer to the same vocabulary; verify.
W <- length(vocab)      # number of terms in the vocab (14,568)
# Tokens per document.  vapply pins the return type; sapply can silently
# change shape/type on empty or ragged input.
doc.length <- vapply(documents, function(x) sum(x[2, ]), numeric(1))
N <- sum(doc.length)    # total number of tokens in the data (546,827)
term.frequency <- as.integer(term.table)  # frequencies of terms in the corpus

# Run a simulation to pick a good number of topics ----------------------
# Re-segment the raw posts and build a tf-idf-filtered DTM for the
# topic-number search.
doc.list <- ptt_dt[, post_text] %>%
  mclapply(cutter, worker = mix_seg, mc.cores = 3) %>%
  mclapply(function(x) x[!is.na(x)], mc.cores = 3)
dtm <- doc.list %>% seglist_to_dtm %>% filter_tfidf_dtm

# https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
tic <- Sys.time()
result <- FindTopicsNumber(
  dtm,
  topics = c(#seq(2, 6, by = 2),
             seq(10, 60, by = 5),
             seq(60, 100, by = 10)#,
             # seq(120, 200, by = 20)
  ),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010"),
  method = "Gibbs",
  # NOTE(review): seeding from the wall clock makes every run
  # non-reproducible; use a fixed seed if results must be repeatable.
  control = list(seed = Sys.time() %>% as.numeric()),
  mc.cores = 3L,
  verbose = TRUE
)
Sys.time() - tic
save(result, file = "models/ptt_kl_simmulation.RData")
FindTopicsNumber_plot(result)

# Topic Model ----------------------------------
# vocab_temp <- dtm_train_tfidf %>% filter_tfidf_dtm() %>% colnames()
# term.table <- vocab_tbl$vocab %>%
#   data.table() %>%
#   .[terms %in% vocab_temp && terms >= 2]
# term.frequency <- term.table[, terms_counts]
# vocab <- term.table[, terms]

# MCMC and model tuning parameters:
K <- 55        # number of topics (chosen from the simulation metrics above)
G <- 3000      # number of Gibbs iterations
alpha <- 0.02  # document-topic prior
eta <- 0.02    # topic-word prior

# Fit the model:
set.seed(2016)
t1 <- Sys.time()
lda_fit <- lda.collapsed.gibbs.sampler(
  documents = documents, K = K, vocab = vocab,
  num.iterations = G, alpha = alpha,
  eta = eta, initial = NULL, burnin = 0,
  compute.log.likelihood = TRUE)
t2 <- Sys.time()
t2 - t1  # about 2.899927 mins on server

# Save Result
save(lda_fit, file = "./models/ptt_keelung_lda_fit.RData")
# 55 topic clusters were selected based on the metrics above.
# Result ---------------------------------------------------------------
library(lda)
load("./models/ptt_keelung_lda_fit.RData")

# Top topic result
# Top 5 documents per topic, by per-document topic counts.
top_docs_num <- lda_fit$document_sums %>% top.topic.documents(5)
# Six highest-scoring words per topic, as a data.table for display.
top_words <- lda_fit$topics %>%
  top.topic.words(num.words = 6, by.score = TRUE) %>%
  data.frame() %>%
  data.table()
top_words %>% DT::datatable()

library(wordVectors)
# Prepare tokenized text file: one space-joined document per line, the
# input format expected by train_word2vec().
ptt_keelung_split <- post_text_seg %>%
  sapply(paste, collapse = " ")
ptt_keelung_split %>% write_lines("data/tokenized/ptt_keelung_split.txt")

# Fit models
tic <- Sys.time()
vector_set <- train_word2vec(train_file = "data/tokenized/ptt_keelung_split.txt",
                             output_file = "models/ptt_keelung_word2vec.bin",
                             force = TRUE,
                             vectors = 100,
                             threads = parallel::detectCores() - 1,
                             window = 12)
print(Sys.time() - tic)

# Nearest neighbours of individual terms.
nearest_to(vector_set, vector_set[["景點"]], n = 20)  # "scenic spots"
nearest_to(vector_set, vector_set[["停車"]], n = 20)  # "parking"
nearest_to(vector_set, vector_set[["交通"]], n = 20)  # "traffic"

# Word-vector analogies of the form a - b + c.
nearest_to(vector_set,
           vector_set[["遊客"]] - vector_set[["夜市"]] + vector_set[["本地人"]],
           n = 10)
nearest_to(vector_set,
           vector_set[["基隆"]] - vector_set[["市長"]] + vector_set[["台北"]],
           n = 10)
nearest_to(vector_set,
           vector_set[["基隆"]] - vector_set[["海洋廣場"]] + vector_set[["台北"]],
           n = 10)

# Analogy: Keelung : traffic = Taipei : ?
nearest_to(vector_set,
           vector_set[["基隆"]] - vector_set[["交通"]] + vector_set[["台北"]],
           n = 10)
nearest_to(vector_set,
           vector_set[["基隆"]] - vector_set[["交通"]] + vector_set[["新北市"]],
           n = 10)
nearest_to(vector_set,
           vector_set[["基隆"]] - vector_set[["河"]] + vector_set[["台北"]],
           n = 10)